Common CRAN and GitHub Packages


In [38]:
import pandas
from matplotlib import pyplot as plt

%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')

# Load the combined GitHub / CRAN / Bioconductor package records.
# DataFrame.from_csv() was deprecated and later removed from pandas;
# read_csv() is the replacement and already defaults to index_col=None,
# i.e. no CSV column is promoted to the index.
data = pandas.read_csv('../data/github-cran-bioc-alldata.csv')

In [39]:
# Convert each date-like column to pandas datetimes so that date
# comparisons and time-based operations work on them.
date_columns = ['Date', 'CommitDate', 'CRANRelease', 'SnapshotFirstDate', 'SnapshotLastDate', 'BiocDate']
for column in date_columns:
    data[column] = pandas.to_datetime(data[column])

# Keep only rows dated strictly before 2015-01-01, then split by origin.
data = data[data['Date'] < '2015-01-01']
github = data[data['Source'] == 'github']
cran = data[data['Source'] == 'cran']

In [43]:
# Reduce each source to one row per package. Sorting chronologically first
# means drop_duplicates (which keeps the first occurrence by default)
# retains the earliest record of every package.
# DataFrame.sort() has been removed from pandas; sort_values() replaces it.
first_github = github.sort_values('Date').drop_duplicates('Package')
first_cran = cran.sort_values('CRANRelease').drop_duplicates('Package')

# Packages that appear in both sources. Columns shared by the two frames
# are disambiguated with the _github / _cran suffixes.
common = first_github.merge(first_cran, how='inner', on='Package', suffixes=('_github', '_cran'))

In [44]:
# Row counts: packages on GitHub, packages on CRAN, packages in both.
tuple(map(len, (first_github, first_cran, common)))


Out[44]:
(5664, 6574, 1184)

In [57]:
def cumulative_monthly_count(frame, date_column):
    """Return the running total of ``Package`` entries per month.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``Package`` column and the datetime column
        named by ``date_column``.
    date_column : str
        Name of the datetime column used as the time axis.

    Returns
    -------
    pandas.DataFrame
        Single ``Package`` column, indexed by month, holding the
        cumulative count of non-null ``Package`` values.
    """
    # resample(..., how='count') is no longer supported by pandas; the
    # aggregation is now a method on the resampler object.
    # NOTE(review): pandas 2.2+ prefers the 'ME' alias for month-end
    # frequency; '1M' is kept for compatibility with older releases.
    monthly = frame.set_index(date_column)[['Package']].resample('1M').count()
    return monthly.cumsum()


# Draw both cumulative curves on the same axes; the x-axis starts in 2009.
ax = cumulative_monthly_count(first_github, 'Date').plot(figsize=(15, 6))
ax = cumulative_monthly_count(first_cran, 'CRANRelease').plot(ax=ax, xlim=('2009-01-01', None))
# Both series are named 'Package', so override the legend labels explicitly.
ax.legend(['Github', 'CRAN Release'], loc='best')
ax.set_ylabel('Cumulative number of packages')
ax.set_xlabel('Date')


Out[57]:
<matplotlib.text.Text at 0x7fe679bbb5d0>

In [42]: